Using time-course RNASeq data from Ophiocordyceps camponoti-floridani liquid culture:
Dataset: O. camponoti-floridani grown in liquid culture as blastspore (three pooled per time point for RNA-extraction and -sequencing), collected every 2h, over a 24h-period. [Control]
# loading database which contains data for Das and de Bekker 2022, from GitHub
db <- dbConnect(RSQLite::SQLite(), paste0(path_to_repo,"/data/databases/TC6_fungal_data.db"))
# specify sample name
sample.name <- "ophio_cflo"
# extract the (gene-expr X time-point) data
dat <-
db %>%
tbl(., paste0(sample.name ,"_fpkm")) %>%
select(gene_name, everything()) %>%
collect()
writeLines("What is the dimensions of the original dataset? [Rows = #genes, Cols = #samples]")
## What is the dimensions of the original dataset? [Rows = #genes, Cols = #samples]
dim(dat[-1])
## [1] 7455 12
The above dataset contains all genes ??(n=13,808)?? (7455 genes, 12 samples) in the fungi genome. However, not all of these genes are expressed, and some are expressed at very low levels that are not biologically meaningful. Therefore, we will only keep the genes that are “expressed” (≥1 FPKM for at least half of all time points).
# Which genes are expressed throughout the day in Ophio-cflo?
# count the number of time points that has ≥ 1 FPKM
n.expressed <- apply(dat[-1], 1, function(x) sum(x >= 1))
# subset the data and only keep the filtered genes
dat <- dat[which(n.expressed >=6),]
writeLines("Dimensions of the data post-filtering step [Rows = #genes, Cols = #samples]")
## Dimensions of the data post-filtering step [Rows = #genes, Cols = #samples]
dim(dat)
## [1] 6874 13
This is our cleaned, input data file for building the circadian GCN. This contains 6874 genes in ??13?? samples.
datExpr = as.data.frame(t(log2(dat[-c(1)]+1)))
names(datExpr) = dat$gene_name
rownames(datExpr) = names(dat)[-c(1)]
# USE THE FOLLOWING CODE TO CHECK IF YOU HAVE ANY BAD SAMPLES #
# gsg = goodSamplesGenes(datExpr, verbose = 3);
# gsg$allOK
# sampleTree = hclust(dist(datExpr0), method = "average");
# # Plot the sample tree: Open a graphic output window of size 12 by 9 inches
# # The user should change the dimensions if the window is too large or too small.
# sizeGrWindow(12,9)
# #pdf(file = "Plots/sampleClustering.pdf", width = 12, height = 9);
# par(cex = 1);
# par(mar = c(0,4,2,0))
# plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
# cex.axis = 1.5, cex.main = 2)
# save the number of genes and samples
# that will be used to create the circadian GCN
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
# visualize the log-transformed data
x = reshape2::melt(as.matrix(t(datExpr)))
colnames(x) = c('gene_id', 'sample', 'value')
writeLines("Visualizing the log-transformed data")
## Visualizing the log-transformed data
ggplot(x, aes(x=value, color=sample)) + geom_density() + theme_Publication()
# Calculate Kendall's tau-b correlation for each gene-gene pair
# sim_matrix <- cor((datExpr), method = "kendall") # this step takes time
# save(sim_matrix, file = paste0(path_to_repo, "/results/temp_files/sim_matrix_", sample.name, "_TC6.RData")) # might be useful to save the sim_matrix and
load(paste0(path_to_repo, "/results/temp_files/sim_matrix_", sample.name, "_TC6.RData")) # load it up
## Let's display a chunk of the matrix (code from Hughitt 2016; github)
heatmap_indices <- sample(nrow(sim_matrix), 200)
writeLines(paste0("Plotting a chunk of the gene-gene similarity matrix with ", length(heatmap_indices), " genes."))
## Plotting a chunk of the gene-gene similarity matrix with 200 genes.
gplots::heatmap.2(t(sim_matrix[heatmap_indices, heatmap_indices]),
col=inferno(100),
labRow=NA, labCol=NA,
trace='none', dendrogram='row',
xlab='Gene', ylab='Gene',
main= paste0("Similarity matrix \n correlation method = 'kendall' \n (", length(heatmap_indices), "random genes)"),
density.info='none', revC=TRUE)
writeLines("Performing network topology analysis to pick soft-thresholding power")
## Performing network topology analysis to pick soft-thresholding power
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=30, by=2))
# # Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
## pickSoftThreshold: will use block size 6508.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 6508 of 6874
## ..working on genes 6509 through 6874 of 6874
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.5520 2.470 0.966 2430.00 2420.00 3430.0
## 2 2 0.0959 0.373 0.927 1220.00 1170.00 2210.0
## 3 3 0.1380 -0.330 0.904 716.00 656.00 1590.0
## 4 4 0.4970 -0.679 0.932 462.00 401.00 1210.0
## 5 5 0.6770 -0.910 0.940 318.00 260.00 952.0
## 6 6 0.7660 -1.030 0.953 229.00 175.00 772.0
## 7 7 0.8130 -1.130 0.958 170.00 123.00 638.0
## 8 8 0.8460 -1.190 0.968 130.00 89.00 536.0
## 9 9 0.8610 -1.240 0.969 102.00 66.00 458.0
## 10 10 0.8700 -1.300 0.968 81.40 49.70 396.0
## 11 12 0.8760 -1.380 0.965 54.00 29.50 304.0
## 12 14 0.8630 -1.440 0.950 37.60 18.40 240.0
## 13 16 0.8740 -1.480 0.957 27.20 12.00 194.0
## 14 18 0.8790 -1.500 0.963 20.20 8.17 159.0
## 15 20 0.8930 -1.510 0.971 15.40 5.73 132.0
## 16 22 0.9100 -1.510 0.981 12.00 4.08 112.0
## 17 24 0.9200 -1.510 0.988 9.47 2.98 95.8
## 18 26 0.9180 -1.520 0.986 7.60 2.21 82.7
## 19 28 0.9100 -1.550 0.984 6.18 1.65 72.3
## 20 30 0.9110 -1.570 0.985 5.09 1.26 64.3
writeLines("Plotting the resutls from the network topology analysis")
## Plotting the resutls from the network topology analysis
# Plot the results:
# sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
NOTE: The scale-free topology fit index reaches ~0.846 at a soft-thresholding-power=8, and it gets saturated after that. R2 is used to quentify how well a scale-free netork satisfies, ranging from 0 (non-scale free topology) to 1 (scale free topology). Networks in a biological contest do resemble scale-free topology. When we maximize for scale-free model fit, there is a natural trade-off between mean number of connections. We want to pick a soft thresshold that satifies the scale-free network while maintaning the highest connectivity.
Now, we can go ahead and create our adjacency matrix by power-transforming the similarity matrix (see WGCNA for more info).
## Specify the soft-thresholding-power which has been set at 8 for Ophio-cflo
soft.power = 8
# Construct adjacency matrix
# adj_matrix <- adjacency.fromSimilarity(sim_matrix,
# power=soft.power,
# type='signed')
# save(adj_matrix, file = paste0(path_to_repo, "/results/temp_files/adj_matrix_", sample.name, "_TC6.RData")) # might be useful to save the sim_matrix and
load(paste0(path_to_repo,"/results/temp_files/adj_matrix_", sample.name,"_TC6.RData")) # load it up
# Convert adj_matrix to matrix
gene_ids <- rownames(adj_matrix)
adj_matrix <- matrix(adj_matrix, nrow=nrow(adj_matrix))
rownames(adj_matrix) <- gene_ids
colnames(adj_matrix) <- gene_ids
writeLines(paste0("Plotting the power-transformed adjacency matrix for the same ", length(heatmap_indices)," genes as above"))
## Plotting the power-transformed adjacency matrix for the same 200 genes as above
## Same heatmap as before, but now with the power-transformed adjacency matrix
gplots::heatmap.2(t(adj_matrix[heatmap_indices, heatmap_indices]),
col=inferno(100),
labRow=NA, labCol=NA,
trace='none', dendrogram='row',
xlab='Gene', ylab='Gene',
main='Adjacency matrix',
density.info='none', revC=TRUE)
## Delete similarity matrix to free up memory
rm(sim_matrix)
# gc()
The following steps are performed as per guidelines from the WGCNA package and several tutorials made available online.
# Turn adjacency into topological overlap
# TOM = TOMsimilarity(adj_matrix);
# dissTOM = 1-TOM
# save(dissTOM, file = paste0(path_to_repo, "/results/temp_files/dissTOM_", sample.name, "_TC6.RData")) # might be useful to save the sim_matrix and
load(paste0(path_to_repo, "/results/temp_files/dissTOM_", sample.name, "_TC6.RData")) # load it up
# Call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average")
writeLines("Plotting the resulting clustering tree (dendrogram)")
## Plotting the resulting clustering tree (dendrogram)
# sizeGrWindow(12,9)
par(mfrow=c(1,1))
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04)
User defined parameters:
# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 50;
# Module identification using dynamic tree cut:
dynamicMods= cutreeDynamic(dendro = geneTree,
distM = dissTOM,
method = "hybrid",
verbose = 4,
deepSplit = 3, # see WGCNA for more info on tuning parameters
pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
## ..cutHeight not given, setting it to 0.988 ===> 99% of the (truncated) height range in dendro.
## ..Going through the merge tree
##
## ..Going through detected branches and marking clusters..
## ..Assigning Tree Cut stage labels..
## ..Assigning PAM stage labels..
## ....assigned 3086 objects to existing clusters.
## ..done.
# view number of genes in each module
table(dynamicMods)
## dynamicMods
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1169 865 568 419 361 316 308 292 291 284 276 163 144 137 135 104
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 101 98 92 88 80 79 75 74 68 65 65 54 53 50
writeLines("How many genes are there in each of the initial modules (clusters) detected?
Note: The names of the modules (colors) have no meaning.")
## How many genes are there in each of the initial modules (clusters) detected?
## Note: The names of the modules (colors) have no meaning.
# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
## dynamicColors
## black blue brown cyan darkgreen
## 308 865 568 137 79
## darkgrey darkorange darkred darkturquoise green
## 74 65 80 75 361
## greenyellow grey60 lightcyan lightgreen lightyellow
## 276 101 104 98 92
## magenta midnightblue orange pink purple
## 291 135 68 292 284
## red royalblue saddlebrown salmon skyblue
## 316 88 53 144 54
## steelblue tan turquoise white yellow
## 50 163 1169 65 419
User defined parameters:
# Calculate eigengenes
MEList = moduleEigengenes(datExpr, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs, method = "kendall");
writeLines("Clustering the module eigengenes and identifying a cutoff to merge similar modules")
## Clustering the module eigengenes and identifying a cutoff to merge similar modules
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
# sizeGrWindow(7, 8)
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "MEDiss = 1-cor(MEs, method = 'kendall')")
# We choose a height cut of 0.3
MEDissThres = 0.3 # user-specified parameter value; see WGCNA manual for more info
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")
writeLines(paste0("Merging modules that have a correlation ≥ ", 1-MEDissThres))
## Merging modules that have a correlation ≥ 0.7
# Call an automatic merging function
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
## mergeCloseModules: Merging modules whose distance is less than 0.3
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 30 module eigengenes in given set.
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 18 module eigengenes in given set.
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 16 module eigengenes in given set.
## Calculating new MEs...
## multiSetMEs: Calculating module MEs.
## Working on set 1 ...
## moduleEigengenes: Calculating 16 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors;
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs;
writeLines("Plotting the identified clusters (denoted with colors) before and after merging.")
## Plotting the identified clusters (denoted with colors) before and after merging.
# sizeGrWindow(12, 9)
plotDendroAndColors(geneTree,
cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1
writeLines("Calculating module-module similarity based on module-eigengene-expression.")
## Calculating module-module similarity based on module-eigengene-expression.
# Calculate similarity of the eigen-genes
sim_matrix_ME <- cor(mergedMEs, method = "kendall")
# calculate adj_matrix
adj_matrix_ME <- adjacency.fromSimilarity(sim_matrix_ME,
power=1, # DO NOT power transform
type='signed'
)
## CHANGE THE NAMES OF THE MODULES;
module_ids <- data.frame(
old_labels = rownames(adj_matrix_ME) %>% str_split("ME", 2) %>% sapply("[", 2) %>% as.character(),
new_labels = paste0("OC", 1:nrow(adj_matrix_ME)))
# coerce into a matrix
adj_matrix_ME <- matrix(adj_matrix_ME, nrow=nrow(adj_matrix_ME))
rownames(adj_matrix_ME) <- module_ids$new_labels
colnames(adj_matrix_ME) <- module_ids$new_labels
writeLines("Plotting the adjacency matrix that shows module-module similarity in expression")
## Plotting the adjacency matrix that shows module-module similarity in expression
gplots::heatmap.2(t(adj_matrix_ME),
col=inferno(100),
# labRow=NA, labCol=NA,
trace='none', dendrogram='row',
xlab='', ylab='',
# main='Similarity matrix - MEs \n correlation method = "kendall")',
main='Adjacency matrix - MEs \n modified edge weights)',
density.info='none', revC=TRUE)
pacman::p_load(igraph)
adj_matrix_ME_igraph <- adj_matrix_ME
# get rid of low correlations (0.6 & 0.8 are arbitrary) [0.7 and 0.9]
adj_matrix_ME_igraph[adj_matrix_ME_igraph < 0.6] <- 0
adj_matrix_ME_igraph[adj_matrix_ME_igraph < 0.8 & adj_matrix_ME_igraph>0] <- 0.5
adj_matrix_ME_igraph[adj_matrix_ME_igraph >= 0.8] <- 1
# build_network
network <- graph.adjacency(adj_matrix_ME_igraph,
mode = "upper",
weighted = T,
diag = F)
# simplify network
network <- igraph::simplify(network) # removes self-loops
# remove isolated vertices (keep only the nodes)
isolated <- which(degree(network)==0)
network <- igraph::delete.vertices(network, isolated)
# E(network)$width <- E(network)$weight + min(E(network)$weight) + 1 # offset=1
# colors <- as.character(module_ids$old_labels)
# V(network)$color <- colors
V(network)$color <- "white"
# genes_ME <- factor(moduleColors, levels=colors) %>% summary()
V(network)$size <- igraph::degree(network, mode = "all")^1.75
# V(network)$size <- log2(genes_ME)^1.3
V(network)$label.color <- "black"
V(network)$frame.color <- "black"
E(network)$width <- E(network)$weight^2*4
E(network)$color <- "black"
# ## highlight shortest paths between two vetices
# short.path <- igraph::get.shortest.paths(network, "S_5", "S_15")
# E(network, path = unlist(short.path[[1]]))$color <- col.scheme[2]
# E(network, path = unlist(short.path[[1]]))$width <- E(network)$weight*8
writeLines("Visualizing a simplified representation of the circadian GCN, with and without labels")
## Visualizing a simplified representation of the circadian GCN, with and without labels
par(mfrow = c(1,1))
## Circular layout
# png(paste0(path_to_repo, "/results/figures/", sample.name[1], "/", script.name,"/", sample.name[1],"_GCN_1.png"),
# width = 20, height = 30, units = "cm", res = 1000)
# par(bg=NA)
plot(network,
layout=layout.kamada.kawai,
# layout=layout.fruchterman.reingold,
# layout=layout.graphopt,
# layout=layout_in_circle,
vertex.label=NA
# vertex.size=hub.score(network)$vector*30
# vertex.shape="none"
)
# dev.off()
# png(paste0(path_to_repo, "/results/figures/", sample.name[1], "/", script.name,"/", sample.name[1],"_GCN_2.png"),
# width = 20, height = 30, units = "cm", res = 600)
# par(bg=NA)
plot(network,
size=20,
layout=layout.kamada.kawai,
# layout=layout.fruchterman.reingold
# layout=layout.graphopt
# layout=layout_in_circle,
# vertex.label=NA
# vertex.size=hub.score(network)$vector*30
vertex.shape="none"
)
# dev.off()
par(mfrow = c(1,1))
# Make a list that returns gene names for a given cluster
module_genes <- list()
modules.to.exclude <- c("")
# modules.to.exclude <- c(paste0("OC",c(4,6,9,12:16)))
which.modules <- module_ids %>% filter(!new_labels %in% modules.to.exclude) %>% pull(old_labels)
which.labels <- module_ids %>% filter(!new_labels %in% modules.to.exclude) %>% pull(new_labels)
# Get the genes from each of the modules
for (i in 1:length(which.modules)) {
# which color
mod.color = as.character(which.modules[[i]])
# subset
module_genes[[i]] <- names(datExpr)[which(moduleColors==mod.color)]
names(module_genes)[[i]] <- as.character(which.labels[[i]])
}
# # check the result | works
# names(module_genes)
# module_genes['OC12']
# [13 Dec 2021]
# Save a csv with the module identity information for all genes used in building the GCN
# make a dataframe with gene_name and module_identity
for (i in 1:length(module_genes)){
if (i == 1){
ocflo.control.mods <- data.frame(gene_name = module_genes[[i]],
module_identity = as.character(names(module_genes)[i]))
}
else{
foo <- data.frame(gene_name = module_genes[[i]],
module_identity = as.character(names(module_genes)[i]))
ocflo.control.mods <- rbind(ocflo.control.mods, foo)
}
}
# # save the dataframe as a csv
# cflo.control.mods %>%
# left_join(module_ids, by = c("module_identity" = "new_labels")) %>%
# write.csv(.,
# paste0(path_to_repo,
# "/results/WGCNA/cflo/cflo_heads_control_module_identity_new_labels.csv"),
# row.names = F)
# # done.
# # save a copy with all the gene annotations
# # load Cflo gene annotations
# cflo_annots <- read.csv(paste0(path_to_repo,"/functions/func_data/cflo_annots.csv"),
# header=T, stringsAsFactors = F, na.strings = c(NA,""," "))
# cflo.control.mods %>%
# left_join(cflo_annots, by="gene_name") %>% head()
# write.csv(.,
# paste0(path_to_repo,
# "/results/WGCNA/cflo/cflo_heads_control_module_identity_new_labels_annots.csv"),
# row.names = F)
# load the list of all genes of interest
load(file = paste0(path_to_repo,"/results/genes_of_interest/goi_list.RData"))
# sapply(goi.list, class)
Full comparison
writeLines("#####################################################
How many genes are in each of my geneset of interest?
#####################################################")
## #####################################################
## How many genes are in each of my geneset of interest?
## #####################################################
## MAKE YOUR LIST OF GENES OF INTEREST ##
# LIST ONE - WGCNA modules
list1 <- module_genes
writeLines("List of interesting genes #1
----------------------------
Genes in each of the identified gene-clusters or modules")
## List of interesting genes #1
## ----------------------------
## Genes in each of the identified gene-clusters or modules
sapply(list1, length)
## OC1 OC2 OC3 OC4 OC5 OC6 OC7 OC8 OC9 OC10 OC11 OC12 OC13 OC14 OC15 OC16
## 1144 1316 317 68 241 226 857 361 144 173 1169 370 74 50 276 88
## LIST TWO - all genes of interst
list2 <- list(gois.tc6[[1]],
gois.tc6[[2]],
gois.tc6[[3]],
gois.tc6[[4]],
gois.tc6[[5]],
gois.tc6[[6]],
gois.tc6[[7]],
gois.tc6[[8]],
gois.tc6[[9]],
gois.tc6[[10]],
gois.tc6[[11]],
gois.tc6[[12]]
)
writeLines("List of interesting genes #2
----------------------------")
## List of interesting genes #2
## ----------------------------
names(list2) <- c("rhy24-Ocflo",
"rhy24-Ocflo-Okim",
"rhy24-Ocflo|Beau-cluster3",
"rhy24-Ocflo|Beau-cluster4",
"Ocflo-inf-UP-all",
"Ocflo-inf-DOWN-all",
"Ocflo-inf-manip-UP",
"Ocflo-inf-manip-DOWN",
"Ocflo-DEGs-inf-only",
"Ocflo-inf-DOWN-manip-UP",
"Ocflo-manip-UP-all",
"Ocflo-manip-DOWN-all"
)
sapply(list2, length)
## rhy24-Ocflo rhy24-Ocflo-Okim rhy24-Ocflo|Beau-cluster3
## 2285 357 451
## rhy24-Ocflo|Beau-cluster4 Ocflo-inf-UP-all Ocflo-inf-DOWN-all
## 324 318 395
## Ocflo-inf-manip-UP Ocflo-inf-manip-DOWN Ocflo-DEGs-inf-only
## 190 161 362
## Ocflo-inf-DOWN-manip-UP Ocflo-manip-UP-all Ocflo-manip-DOWN-all
## 116 1867 1625
## CHECK FOR OVERLAP
## make a GOM object
gom.1v2 <- newGOM(list1, list2,
genome.size = nGenes)
png(paste0(path_to_repo, "/results/figures/", sample.name[1],"_gom_1v2.png"),
width = 50, height = 30, units = "cm", res = 300)
drawHeatmap(gom.1v2,
adj.p=T,
cutoff=0.05,
what="odds.ratio",
# what="Jaccard",
log.scale = T,
note.col = "grey80")
trash <- dev.off()
# writeLines("How many genes exactly are overlapping between the pairwise comparisons")
# getMatrix(gom.1v4, name = "intersection") %>% t()
writeLines("Visualizing the significant overlaps between your lists of interesting genes and the identified modules")
## Visualizing the significant overlaps between your lists of interesting genes and the identified modules
Where are my genesets of interests?
Focused comparison
writeLines("#####################################################
How many genes are in each of my geneset of interest?
#####################################################")
## #####################################################
## How many genes are in each of my geneset of interest?
## #####################################################
## MAKE YOUR LIST OF GENES OF INTEREST ##
# LIST ONE - WGCNA modules
list1 <- module_genes
writeLines("List of interesting genes #1
----------------------------
Genes in each of the identified gene-clusters or modules")
## List of interesting genes #1
## ----------------------------
## Genes in each of the identified gene-clusters or modules
sapply(list1, length)
## OC1 OC2 OC3 OC4 OC5 OC6 OC7 OC8 OC9 OC10 OC11 OC12 OC13 OC14 OC15 OC16
## 1144 1316 317 68 241 226 857 361 144 173 1169 370 74 50 276 88
## LIST THREE - focused genes of interest
list3 <- list(gois.tc6[[1]],
gois.tc6[[2]],
gois.tc6[[3]],
gois.tc6[[4]],
# gois.tc6[[5]],
# gois.tc6[[6]],
gois.tc6[[7]],
gois.tc6[[8]],
gois.tc6[[9]],
gois.tc6[[10]]
# gois.tc6[[11]],
# gois.tc6[[12]]
)
writeLines("List of interesting genes #2
----------------------------")
## List of interesting genes #2
## ----------------------------
names(list3) <- c("rhy24-Ocflo",
"rhy24-Ocflo-Okim",
"rhy24-Ocflo|Beau-cluster3",
"rhy24-Ocflo|Beau-cluster4",
# "Ocflo-inf-UP-all",
# "Ocflo-inf-DOWN-all",
"Ocflo-inf-manip-UP",
"Ocflo-inf-manip-DOWN",
"Ocflo-DEGs-inf-only",
"Ocflo-inf-DOWN-manip-UP"
# "Ocflo-manip-UP-all",
# "Ocflo-manip-DOWN-all"
)
sapply(list3, length)
## rhy24-Ocflo rhy24-Ocflo-Okim rhy24-Ocflo|Beau-cluster3
## 2285 357 451
## rhy24-Ocflo|Beau-cluster4 Ocflo-inf-manip-UP Ocflo-inf-manip-DOWN
## 324 190 161
## Ocflo-DEGs-inf-only Ocflo-inf-DOWN-manip-UP
## 362 116
## CHECK FOR OVERLAP
## make a GOM object
gom.1v3 <- newGOM(list1, list3,
genome.size = nGenes)
png(paste0(path_to_repo, "/results/figures/", sample.name[1],"_gom_1v3.png"),
width = 30, height = 30, units = "cm", res = 300)
drawHeatmap(gom.1v3,
adj.p=T,
cutoff=0.05,
what="odds.ratio",
# what="Jaccard",
log.scale = T,
note.col = "grey80")
trash <- dev.off()
# writeLines("How many genes exactly are overlapping between the pairwise comparisons")
# getMatrix(gom.1v4, name = "intersection") %>% t()
writeLines("Visualizing the significant overlaps between your lists of interesting genes and the identified modules")
## Visualizing the significant overlaps between your lists of interesting genes and the identified modules
Where are my genesets of interests?
From WGCNA-tutorial
“We begin by calculating the intramodular connectivity for each gene. (In network literature, connectivity is often referred to as”degree”.) The function intramodularConnectivity computes:
# From what I can tell, colorh1 in the tutorial refers to moduleColors
colorh1 <- moduleColors
# Calculate the connectivities of the genes
Alldegrees1=intramodularConnectivity(adjMat = adj_matrix, colors = colorh1)
Alldegrees1 <-
Alldegrees1 %>%
rownames_to_column("gene_name") %>%
mutate_at(vars(starts_with("k")),
function(x){
round(x,2)
})
head(Alldegrees1)
## gene_name kTotal kWithin kOut kDiff
## 1 GQ602_000001 257.04 163.01 94.02 68.99
## 2 GQ602_000002 135.90 59.67 76.23 -16.56
## 3 GQ602_000003 175.23 32.82 142.41 -109.59
## 4 GQ602_000004 81.25 11.45 69.80 -58.36
## 5 GQ602_000005 104.26 15.93 88.33 -72.40
## 6 GQ602_000006 96.75 40.82 55.93 -15.11
Plotting the mean (± 95% CI) connectivity of genes in different modules
pd <- position_dodge(0.1)
Alldegrees1 %>%
left_join(ocflo.control.mods, by="gene_name") %>%
# PLOT FROM RAW DATA
mutate(module_identity = factor(module_identity,
levels = paste0("OC",1:nrow(module_ids)))) %>%
ggplot(aes(x=module_identity, y=kTotal)) +
# ggplot(aes(x=module_identity, y=kWithin)) +
# geom_hline(yintercept = 45, col="darkgrey", alpha=0.8) +
# geom_hline(yintercept = 30, col="darkgrey", alpha=0.8) +
# geom_hline(yintercept = 75, col="darkgrey", alpha=0.8) +
geom_boxplot(fill="lightgrey",
alpha=0.6,
outlier.color = "grey60",
position = "dodge2") +
theme_Publication() +
scale_colour_Publication() +
xlab("") +
ggtitle("") +
ylab("Total connectivity") +
# ylab("Intramodular connectivity") +
theme(text = element_text(size = 20, colour = 'black'),
legend.position = "none",
axis.line.y = element_line(colour = "transparent",size=1)) +
coord_flip()
There are several things we could output from our analyses, we decided to report the following in the supplementary file:
module_pfams <- list()
bg.genes <- dat[[1]] ## all genes used to make the network
for (i in 1:length(which.labels)) {
# get name of the module
m <- which.labels[[i]]
which.test <- "pfams"
# save the enrichment results
module_pfams[[i]] <-
module_genes[[m]] %>%
check_enrichment(.,
bg = dat[[1]],
what = which.test,
clean = T,
expand = T,
plot = F)
}
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "No enriched terms found; can't expand."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [1] "No enriched terms found; can't expand."
## Now clean it up
sapply(module_pfams, nrow)
## [[1]]
## [1] 475
##
## [[2]]
## [1] 12
##
## [[3]]
## [1] 6
##
## [[4]]
## [1] 52
##
## [[5]]
## [1] 5
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## [1] 123
##
## [[9]]
## NULL
##
## [[10]]
## [1] 75
##
## [[11]]
## [1] 0
##
## [[12]]
## [1] 5
##
## [[13]]
## [1] 14
##
## [[14]]
## [1] 22
##
## [[15]]
## [1] 124
##
## [[16]]
## [1] 0
for (i in 1:length(module_pfams)) {
if(is.null(nrow(module_pfams[[i]]))) {
paste(which.labels[[i]],"is null") %>% print()
} else if(nrow(module_pfams[[i]])==0) {
paste(which.labels[[i]], "is an empty tibble") %>% print()
} else {
if(i==1) {
module.pfams <- module_pfams[[i]]
} else {
module.pfams <- rbind(module.pfams, module_pfams[[i]])
}
}
}
## [1] "OC6 is null"
## [1] "OC7 is null"
## [1] "OC9 is null"
## [1] "OC11 is an empty tibble"
## [1] "OC16 is an empty tibble"
## change the name of the column
module.pfams <-
module.pfams %>%
select(gene_name, enriched_in_module=annot_desc) %>%
filter(enriched_in_module!="no_desc")
# check the output dataframe
module.pfams %>% head()
## # A tibble: 6 x 2
## # Groups: gene_name [6]
## gene_name enriched_in_module
## <chr> <chr>
## 1 GQ602_000010 MFS_1
## 2 GQ602_000016 MFS_1; Sugar_tr
## 3 GQ602_000018 Enterotoxin_a
## 4 GQ602_000112 zf-C2H2
## 5 GQ602_000135 Enterotoxin_a
## 6 GQ602_000344 FAD_binding_3
Let’s make the file
results.gcn <-
ocflo.control.mods %>%
pull(gene_name) %>%
## rhythmicity data
TC6_annotator() %>%
select(gene_name = ophio_gene, everything()) %>%
## cluster identity
left_join(ocflo.control.mods, by="gene_name") %>%
## add total connectivity data
left_join(Alldegrees1 %>% select(gene_name,kTotal), by="gene_name") %>%
## add the enriched pfam
left_join(module.pfams, by="gene_name") %>%
## Geneset: "rhy24-Ocflo-Okim"
mutate(rhy_ocflo_okim = ifelse(gene_name %in% gois.tc6[[2]], "yes", "no")) %>%
## Geneset: "rhy24-Ocflo|Beau-cluster3"
mutate(rhy_ocflo_beau_cluster3 = ifelse(gene_name %in% gois.tc6[[3]], "yes", "no")) %>%
## Geneset: "rhy24-Ocflo|Beau-cluster4"
mutate(rhy_ocflo_beau_cluster4 = ifelse(gene_name %in% gois.tc6[[4]], "yes", "no")) %>%
## order the columns and rows
select(module_identity, kTotal, enriched_in_module,
gene_name, gene_desc,
rhy_ocflo_okim, rhy_ocflo_beau_cluster3, rhy_ocflo_beau_cluster4,
everything()) %>%
arrange(module_identity, enriched_in_module, desc(kTotal))
## export it
write.csv(results.gcn,
file = paste0(path_to_repo, "/results/00_supplementary_files/07_Ocflo_GCN_results.csv"),
row.names = F)
results.gcn %>%
filter(module_identity=="OC1") %>%
# ## summarize the results
# group_by(inf_v_control, control_rhy24) %>%
# summarize(n()) %>%
## pull rhythmic genes that are up/down-regulated
# filter(inf_v_control=="up" & control_rhy24=="yes") %>%
filter(inf_v_control=="down" & control_rhy24=="yes") %>%
## run enrichments
pull(gene_name) %>%
check_enrichment(.,
bg = dat[[1]],
what = "pfams",
clean = T,
expand = T) %>%
## run stacked zplots
pull(gene_name) %>%
stacked.zplot_tc6(plot.mean = F, bg.alpha = 0.8, bg.lwd=1.5)
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."
## [[1]]